This tutorial will introduce you to the standard data processing pipeline used in JiangLab. In short, the pipeline has five main components:

  1. Nuclear channel filter to filter out “fake” cells with minimal nucleus signal
  2. Nuclear channel normalization to decrease the core-to-core variance if any
  3. Arcsinh transformation to make the data less skewed
  4. Trim off extreme/outliers at the lower and upper end of marker distribution
  5. Transform the data to a range of [0,1] by quantile/percentile

In the following sections, these steps will be explored in detail.

Required Libraries

library(tidyverse)
library(matrixStats)
library(ggcorrplot)
#library(isotree)

Example Data

In this tutorial, we will use a MIBI data set which was generated by Yunhao as an example. First, we will load the data using read_csv from the tidyverse package. After reading the dataframe, you should see its size from the environment tab or you can use dim to check yourself. For this particular dataframe, it contains 3335051 cells and 52 variables, within which 46 of them are protein markers.

df <- read_csv('/mnt/nfs/home/huayingqiu/Tutorial/Data_processing/Betty_allRuns_REDSEA_raw.csv')

After reading the dataframe, we will use summary to briefly check the variables in the dataframe. We can see that this dataframe contains several identifiers such as cellLabel, X_cent, Y_cent, and pointNum which can help us identify where the cells come from. The rest of the variables are all protein markers and you can check the basic statistics from the output of summary.

summary(df)
##    cellLabel         X_cent         Y_cent        cellSize    
##  Min.   :    1   Min.   :  25   Min.   :  53   Min.   :  1.0  
##  1st Qu.:10322   1st Qu.: 656   1st Qu.: 647   1st Qu.: 67.0  
##  Median :21716   Median :1267   Median :1249   Median : 94.0  
##  Mean   :22906   Mean   :1283   Mean   :1272   Mean   :105.5  
##  3rd Qu.:33803   3rd Qu.:1902   3rd Qu.:1886   3rd Qu.:131.0  
##  Max.   :68124   Max.   :2611   Max.   :2583   Max.   :933.0  
##       CD45               CD20              dsDNA            pSLP-76         
##  Min.   :  0.0000   Min.   :  0.0000   Min.   : 0.0000   Min.   :  0.00000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.: 0.0000   1st Qu.:  0.00000  
##  Median :  0.6358   Median :  0.2687   Median : 0.4820   Median :  0.00000  
##  Mean   :  1.0495   Mean   :  0.9320   Mean   : 0.6574   Mean   :  0.09023  
##  3rd Qu.:  1.4710   3rd Qu.:  1.2497   3rd Qu.: 1.0296   3rd Qu.:  0.00000  
##  Max.   :251.2400   Max.   :346.0700   Max.   :59.1460   Max.   :206.97000  
##      SLP-76         anti-H2AX (pS139)       CD163            Histone H3     
##  Min.   : 0.00000   Min.   :   0.0000   Min.   :   0.000   Min.   :   0.00  
##  1st Qu.: 0.00000   1st Qu.:   0.0000   1st Qu.:   0.000   1st Qu.:  62.16  
##  Median : 0.00000   Median :   0.0000   Median :   0.743   Median :  88.05  
##  Mean   : 0.07356   Mean   :   0.1393   Mean   :   1.751   Mean   :  99.51  
##  3rd Qu.: 0.00000   3rd Qu.:   0.0000   3rd Qu.:   1.961   3rd Qu.: 124.62  
##  Max.   :57.20100   Max.   :1530.2000   Max.   :1039.000   Max.   :1026.40  
##      CD45RO             CD28         CD153 (CD30L)            Lag3          
##  Min.   :  0.000   Min.   : 0.0000   Min.   :  0.00000   Min.   :  0.00000  
##  1st Qu.:  0.527   1st Qu.: 0.0000   1st Qu.:  0.00000   1st Qu.:  0.00000  
##  Median :  1.222   Median : 0.0000   Median :  0.00000   Median :  0.00000  
##  Mean   :  1.429   Mean   : 0.2403   Mean   :  0.09724   Mean   :  0.26745  
##  3rd Qu.:  2.058   3rd Qu.: 0.3480   3rd Qu.:  0.00000   3rd Qu.:  0.01653  
##  Max.   :236.760   Max.   :58.6200   Max.   :123.85000   Max.   :272.97000  
##       CD4               CD11c                CD56              FoxP3        
##  Min.   :  0.0000   Min.   :  0.00000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.:  0.1336   1st Qu.:  0.00000   1st Qu.: 0.00000   1st Qu.: 0.0000  
##  Median :  1.1290   Median :  0.02439   Median : 0.00000   Median : 0.4709  
##  Mean   :  1.6895   Mean   :  0.98799   Mean   : 0.04362   Mean   : 0.6202  
##  3rd Qu.:  2.4764   3rd Qu.:  1.17150   3rd Qu.: 0.00000   3rd Qu.: 0.9837  
##  Max.   :428.0600   Max.   :240.53000   Max.   :83.40500   Max.   :24.1220  
##      GATA3            Granzyme B           PD-L1               CD16         
##  Min.   : 0.00000   Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.: 0.00000   1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median : 0.00000   Median :  0.0000   Median :  0.0000   Median :  0.0000  
##  Mean   : 0.02499   Mean   :  0.2382   Mean   :  0.2427   Mean   :  0.1663  
##  3rd Qu.: 0.00000   3rd Qu.:  0.0000   3rd Qu.:  0.3265   3rd Qu.:  0.0000  
##  Max.   :62.78100   Max.   :234.2800   Max.   :206.0600   Max.   :239.4700  
##      Ki-67               PD-1              Pax-5               Tox          
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median :  0.0000   Median :  0.0000   Median :  0.0000   Median :  0.0000  
##  Mean   :  0.4405   Mean   :  0.1314   Mean   :  0.3242   Mean   :  0.2911  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0502   3rd Qu.:  0.1524   3rd Qu.:  0.3592  
##  Max.   :308.5800   Max.   :674.9900   Max.   :333.6400   Max.   :828.8700  
##      CD161               CD68          B2-Microglobulin        CD8          
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000   Min.   : 0.00000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.8142   1st Qu.: 0.00000  
##  Median :  0.3155   Median :  0.0000   Median :  1.4270   Median : 0.00000  
##  Mean   :  0.4647   Mean   :  0.6107   Mean   :  1.5959   Mean   : 0.09993  
##  3rd Qu.:  0.7402   3rd Qu.:  0.4374   3rd Qu.:  2.1335   3rd Qu.: 0.00000  
##  Max.   :447.4600   Max.   :186.1700   Max.   :100.0500   Max.   :43.58900  
##       CD3                HLA1              CD15               Tbet         
##  Min.   :  0.0000   Min.   :  0.000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.:  0.0000   1st Qu.:  1.385   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median :  0.6947   Median :  2.158   Median :  0.0000   Median :  0.0000  
##  Mean   :  1.2906   Mean   :  2.464   Mean   :  0.5587   Mean   :  0.0205  
##  3rd Qu.:  1.9907   3rd Qu.:  3.150   3rd Qu.:  0.0000   3rd Qu.:  0.0000  
##  Max.   :193.5300   Max.   :115.000   Max.   :539.4600   Max.   :324.8500  
##       CD14              CD123               CXCR5              CD45RA       
##  Min.   :  0.0000   Min.   :   0.0000   Min.   :  0.0000   Min.   : 0.0000  
##  1st Qu.:  0.2929   1st Qu.:   0.0000   1st Qu.:  0.1353   1st Qu.: 0.0000  
##  Median :  1.1771   Median :   0.0000   Median :  0.5525   Median : 0.9176  
##  Mean   :  1.8945   Mean   :   0.0583   Mean   :  0.6933   Mean   : 1.9359  
##  3rd Qu.:  2.4873   3rd Qu.:   0.0000   3rd Qu.:  1.0381   3rd Qu.: 2.9651  
##  Max.   :609.7200   Max.   :1055.0000   Max.   :216.7800   Max.   :70.4190  
##      HLA-DR              CD57              IL-10               CD30         
##  Min.   :  0.0000   Min.   :  0.0000   Min.   : 0.00000   Min.   : 0.00000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.: 0.00000   1st Qu.: 0.00000  
##  Median :  0.5364   Median :  0.0000   Median : 0.00000   Median : 0.00000  
##  Mean   :  1.3128   Mean   :  0.1937   Mean   : 0.01807   Mean   : 0.02403  
##  3rd Qu.:  1.6740   3rd Qu.:  0.0000   3rd Qu.: 0.00000   3rd Qu.: 0.00000  
##  Max.   :342.5400   Max.   :461.0600   Max.   :61.02300   Max.   :42.69300  
##       TIM3             RORgT              TCRgd               CD86      
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.00000   Min.   :0.000  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.:0.000  
##  Median : 0.0000   Median : 0.00000   Median : 0.00000   Median :0.000  
##  Mean   : 0.1243   Mean   : 0.03527   Mean   : 0.04543   Mean   :0.044  
##  3rd Qu.: 0.1053   3rd Qu.: 0.00000   3rd Qu.: 0.00000   3rd Qu.:0.000  
##  Max.   :38.0890   Max.   :15.45500   Max.   :36.42500   Max.   :9.557  
##       CD25           Na-K ATPase         pointNum       filename        
##  Min.   :  0.0000   Min.   : 0.0000   Min.   : 3.00   Length:3335051    
##  1st Qu.:  0.0000   1st Qu.: 0.9002   1st Qu.:23.00   Class :character  
##  Median :  0.0000   Median : 1.5445   Median :44.00   Mode  :character  
##  Mean   :  0.2183   Mean   : 1.7849   Mean   :43.76                     
##  3rd Qu.:  0.2248   3rd Qu.: 2.3592   3rd Qu.:65.00                     
##  Max.   :140.0700   Max.   :55.4740   Max.   :83.00

The markers we have in this dataframe are

colnames(df[,5:50])
##  [1] "CD45"              "CD20"              "dsDNA"            
##  [4] "pSLP-76"           "SLP-76"            "anti-H2AX (pS139)"
##  [7] "CD163"             "Histone H3"        "CD45RO"           
## [10] "CD28"              "CD153 (CD30L)"     "Lag3"             
## [13] "CD4"               "CD11c"             "CD56"             
## [16] "FoxP3"             "GATA3"             "Granzyme B"       
## [19] "PD-L1"             "CD16"              "Ki-67"            
## [22] "PD-1"              "Pax-5"             "Tox"              
## [25] "CD161"             "CD68"              "B2-Microglobulin" 
## [28] "CD8"               "CD3"               "HLA1"             
## [31] "CD15"              "Tbet"              "CD14"             
## [34] "CD123"             "CXCR5"             "CD45RA"           
## [37] "HLA-DR"            "CD57"              "IL-10"            
## [40] "CD30"              "TIM3"              "RORgT"            
## [43] "TCRgd"             "CD86"              "CD25"             
## [46] "Na-K ATPase"

Here, let’s examine their distribution before the data is processed

df %>% 
  select(pointNum, all_of(marker_names)[1:6]) %>% 
  # transform the dataframe longer for plotting
  pivot_longer(cols = -c(pointNum), names_to = 'marker', values_to = 'value') %>% 
  ggplot(aes(x = value, fill = as.factor(pointNum))) + 
  geom_histogram(bins = 100, alpha = 0.3) + 
  facet_wrap(~marker, ncol = 3, nrow = 2, scales = 'free') + 
  theme_bw() + 
  theme(legend.position = 'none')

As we can see, the distributions for all those markers are very right skewed. Most of the signals are concentrated at the lower end. This is usually the case for our data, i.e., similar for CODEX data.

We knew that SLP-76, pSLP-76, dsDNA, anti-H2AX (pS139), and CD123 were not working. Therefore, we drop those markers from the dataframe here.

df <- df %>% 
  dplyr::select(-`SLP-76`,
         -`pSLP-76`,
         -dsDNA,
         -`anti-H2AX (pS139)`,
         -CD123)

marker_names <- colnames(df[,5:45])

Nuclear Channel Filter

Since our dataframe is generated based on segmentation masks, we might have a lot of “cells”, which are truely segmentation artifacts, in the dataframe that have none to minimal nuclear marker signal. We want to remove those artifacts first. The first thing to do is naturally to quantify how many observations have 0 nuclear marker signal and filter them out.

# Quantify how many observations have 0 nuclear marker singal

df %>% 
  filter(`Histone H3` == 0) %>% 
  dplyr::count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   379
# Filter those cells out

df <- df %>%
  filter(`Histone H3` > 0)

Then, we will need to check the distribution of the nuclear channel, Histone H3 in this case, to see if there are any cells with suspiciously low nuclear marker signals. Upon examining the distribution, one thing to note is that we can see the nuclear marker singal for all FOVs align pretty well. That’s a good thing and that’s what you should be seeing in most cases, if the experiment is done well. Of course, there might be some biological factors that can make some FOV’s nuclear signal stronger or weaker. When you see abnormality, always check the raw image data first before you decided if you want to throw out the data or not.

df %>% 
  select(pointNum, `Histone H3`) %>% 
  ggplot(aes(x = `Histone H3`, fill = as.factor(pointNum))) + 
  geom_histogram(bins = 100, alpha = 0.5) + 
  theme_bw() + 
  theme(legend.position = 'none')

df %>% 
  select(pointNum, `Histone H3`) %>% 
  filter(pointNum <= 40) %>% 
  ggplot(aes(x = as.factor(pointNum), y = `Histone H3`, fill = as.factor(pointNum))) + 
  geom_boxplot(outlier.alpha = 0.3) + 
  theme_bw() + 
  theme(legend.position = 'none') + 
  labs(x = 'FOV')

df %>% 
  select(pointNum, `Histone H3`) %>% 
  filter(pointNum > 40) %>% 
  ggplot(aes(x = as.factor(pointNum), y = `Histone H3`, fill = as.factor(pointNum))) + 
  geom_boxplot(outlier.alpha = 0.3) + 
  theme_bw() + 
  theme(legend.position = 'none') + 
  labs(x = 'FOV')

From the boxplot, we can see that most outliers are cells with very bright nuclear marker signal and fewer outliers are those cells with nuclear marker signal below \(Q_{1} - 1.5IQR\), where \(Q_{1}\) is the first quantile or 25th percentile and \(IQR\) is the inter-quantile range, \(Q_{3} - Q_{1}\), the diffrence between the 75th percentile and the 25th percentile. Let’s filter out cells that have nuclear marker signal less than \(Q_{1} - 1.5IQR\). Those outliers should be examined more carefully in combination with other markers because

Only in the 2nd case should we consider removing those cells. Since the marker intensities are not yet normalized and it may not be comparable across different FOVs, we will use DBSCAN within each FOV to detect potential outlier cells within each FOV. However, for the purpose of this tutorial, since we only have 1809 at the lower end, which is only about \(0.05\%\) of the data. We will just drop those cells. And the upper end outliers is generally of less concern. More robust outlier detection method will be updated later.

df %>% 
  group_by(pointNum) %>% 
  mutate(min = quantile(`Histone H3`, 0.25) - 1.5 * IQR(`Histone H3`)) %>% 
  filter(`Histone H3` < min) %>% 
  ungroup() %>% 
  dplyr::count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1809
df <- df %>% 
  group_by(pointNum) %>% 
  mutate(min = quantile(`Histone H3`, 0.25) - 1.5 * IQR(`Histone H3`)) %>% 
  filter(`Histone H3` >= min) %>% 
  ungroup()

Median Nuclear Marker Signal Normalization

This step is to correct the core-to-core variation of markers to make the same marker more comparable across different FOVs. As you can see from the correlation plot below, the Histone H3 signal is positively correlated with most markers, which means that if an FOV has higher Histone H3 signal, the signal of other markers of that FOV would also tend to be higher.

mean_marker_df <- df %>% 
  dplyr::select(pointNum, all_of(marker_names)) %>% 
  pivot_longer(cols = -pointNum, names_to = 'marker', values_to = 'value') %>% 
  group_by(pointNum, marker) %>% 
  summarise(marker_mean = mean(value)) %>% 
  ungroup() %>% 
  pivot_wider(id_cols = pointNum, names_from = 'marker', values_from = 'marker_mean')

ggcorrplot(as.matrix(cor(mean_marker_df[,2:42], method = 'spearman')['Histone H3', ])) + 
  theme(axis.text.x = element_text(angle = 90, size = 10),
        axis.text.y = element_blank())

While biologically, nuclear marker signal for all cells should be similar. Therefore, within each FOV, we devide each marker by the median nuclear signal to make the signal intensity between different FOVs more comparable

df_norm <- df

for (i in 1:length(unique(df$pointNum))){
  row_index <- which(df$pointNum == unique(df$pointNum)[i])
  df_norm[row_index, 5:45] <- df[row_index, 5:45]/median(df$`Histone H3`[row_index])
}

Let’s look at the distribution of all the markers after the normalization.

Arcsinh (Inverse Hyperbolic Sine) Transformation

This step will make the distribution of the markers less skewed and create a bimodal distribution whose peaks and neighboring masses can be easily interpreted as the positive and negative cells for that marker. We will show CD3, CD4, CD8, and CD20 here as an example.

markers_to_trans <- c('CD3')

df_norm %>%
  dplyr::select(pointNum, all_of(markers_to_trans)) %>% 
  mutate(
    across(
      all_of(markers_to_trans),
      list(
        "1000" = ~ asinh(.x / 1000),
        "100" = ~ asinh(.x / 100),
        "10" = ~ asinh(.x / 10),
        "0.1" = ~ asinh(.x / 0.1),
        "0.01" = ~ asinh(.x / 0.01),
        "0.001" = ~ asinh(.x / 0.001)
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>% 
  pivot_longer(cols = -c('pointNum'), names_to = 'marker', values_to = 'value') %>% 
  ggplot(aes(x = value, color = as.factor(pointNum))) +
  geom_histogram(bins = 100, alpha = 0.3) + 
  facet_wrap(~marker, scales = 'free') + 
  theme_bw() + 
  theme(legend.position = 'none')

markers_to_trans <- c('CD4')

df_norm %>%
  dplyr::select(pointNum, all_of(markers_to_trans)) %>% 
  mutate(
    across(
      all_of(markers_to_trans),
      list(
        "1000" = ~ asinh(.x / 1000),
        "100" = ~ asinh(.x / 100),
        "10" = ~ asinh(.x / 10),
        "0.1" = ~ asinh(.x / 0.1),
        "0.01" = ~ asinh(.x / 0.01),
        "0.001" = ~ asinh(.x / 0.001)
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>% 
  pivot_longer(cols = -c('pointNum'), names_to = 'marker', values_to = 'value') %>% 
  ggplot(aes(x = value, color = as.factor(pointNum))) +
  geom_histogram(bins = 100, alpha = 0.3) + 
  facet_wrap(~marker, scales = 'free') + 
  theme_bw() + 
  theme(legend.position = 'none')

markers_to_trans <- c('CD8')

df_norm %>%
  dplyr::select(pointNum, all_of(markers_to_trans)) %>% 
  mutate(
    across(
      all_of(markers_to_trans),
      list(
        "1000" = ~ asinh(.x / 1000),
        "100" = ~ asinh(.x / 100),
        "10" = ~ asinh(.x / 10),
        "0.1" = ~ asinh(.x / 0.1),
        "0.01" = ~ asinh(.x / 0.01),
        "0.001" = ~ asinh(.x / 0.001)
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>% 
  pivot_longer(cols = -c('pointNum'), names_to = 'marker', values_to = 'value') %>% 
  ggplot(aes(x = value, color = as.factor(pointNum))) +
  geom_histogram(bins = 100, alpha = 0.3) + 
  facet_wrap(~marker, scales = 'free') + 
  theme_bw() + 
  theme(legend.position = 'none')

markers_to_trans <- c('CD20')

df_norm %>%
  dplyr::select(pointNum, all_of(markers_to_trans)) %>% 
  mutate(
    across(
      all_of(markers_to_trans),
      list(
        "1000" = ~ asinh(.x / 1000),
        "100" = ~ asinh(.x / 100),
        "10" = ~ asinh(.x / 10),
        "0.1" = ~ asinh(.x / 0.1),
        "0.01" = ~ asinh(.x / 0.01),
        "0.001" = ~ asinh(.x / 0.001)
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>% 
  pivot_longer(cols = -c('pointNum'), names_to = 'marker', values_to = 'value') %>% 
  ggplot(aes(x = value, color = as.factor(pointNum))) +
  geom_histogram(bins = 100, alpha = 0.3) + 
  facet_wrap(~marker, scales = 'free') + 
  theme_bw() + 
  theme(legend.position = 'none')

For this dataframe, the cofactor of 0.001 seems to be optimal. Hence, we will proceed with the arcsinh transformed data with cofactor 0.001.

df_arcsinh <- df_norm %>% 
  mutate(across(all_of(marker_names), ~asinh(.x/0.1)))

Universal Percentile Normalization

This step will transform the data to have range \(\left[0,1\right]\) based on a pair of chosen percentiles \(\left(P_{low}, P_{high}\right)\).

\[ X_{norm} = \frac{X - X_{P_{low}}}{X_{P_{high}} - X_{P_{low}}} \]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.001, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

Now, let’s take a look at the distribution of the markers after the normalization.

Percentile Normalization

This step will transform the data to have range \(\left[0,1\right]\) based on a pair of chosen percentiles \(\left(P_{low}, P_{high}\right)\). \[ X_{norm} = \frac{X - X_{P_{low}}}{X_{P_{high}} - X_{P_{low}}} \]

df_arcsinh %>% 
  filter(pointNum == 4) %>% 
  mutate(CD20 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

As for tunning the lower and upper bound for difficult markers, unfortunately, the current way is to set some initial values to start with and compare the signal image to the raw image, with your adjustments to show only the true signals. If the two corresponds very well, you are done. Otherwise, you will know if you need to allow for higher signals or you need to cut out more lower end signals by looking at them. The following are some examples to show you how the lower and upper bound can affect the signal image that the computer will be “seeing”. Sometimes, different \(\left(P_{low}, P_{high}\right)\) need to be set for different markers. Therefore, you should check all the markers that you care about before you proceed to clustering and all the downstream analysis.

[0.001, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.001, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.1, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.1, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.3, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.3, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.4, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.4, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.5, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.5, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.6, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.6, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.7, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.7, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

[0.8, 0.999]

df_trans <- df_arcsinh

rng <- colQuantiles(as.matrix(df_arcsinh[,5:45]), probs = c(0.8, 0.999))

expr <- t((t(as.matrix(df_arcsinh[,5:45]))-rng[,1]) / (rng[,2]-rng[,1]))

expr[expr < 0] <- 0

expr[expr > 1] <- 1

df_trans[,5:45] <- expr

df_trans %>% 
  filter(pointNum == 4) %>% 
  mutate(CD3 = ifelse(CD3 == 0, NA, CD3)) %>% 
  ggplot(aes(x = X_cent, y = Y_cent, color = CD3)) + 
  geom_point(alpha = 1) + 
  scale_color_gradient(low = 'white', high = 'red', na.value = NA) + 
  theme_bw() + 
  theme(panel.grid = element_blank(),
        panel.border = element_blank())